{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Conditional Autoencoder for Asset Pricing - Data Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:28.261069Z",
     "start_time": "2020-06-22T14:15:27.707077Z"
    }
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from statsmodels.regression.rolling import RollingOLS\n",
    "import statsmodels.api as sm\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:28.263990Z",
     "start_time": "2020-06-22T14:15:28.262126Z"
    }
   },
   "outputs": [],
   "source": [
    "idx = pd.IndexSlice\n",
    "sns.set_style('whitegrid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:28.278110Z",
     "start_time": "2020-06-22T14:15:28.265032Z"
    }
   },
   "outputs": [],
   "source": [
    "# Directory holding the input data and every artifact this notebook writes.\n",
    "results_path = Path('results', 'asset_pricing')\n",
    "# exist_ok=True replaces the separate exists() check and keeps the cell idempotent.\n",
    "results_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:28.701443Z",
     "start_time": "2020-06-22T14:15:28.278914Z"
    }
   },
   "outputs": [],
   "source": [
    "prices = pd.read_hdf(results_path / 'data.h5', 'stocks/prices/adjusted')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:29.022795Z",
     "start_time": "2020-06-22T14:15:28.702378Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 18229815 entries, ('A', Timestamp('1999-11-18 00:00:00')) to ('ZYXI', Timestamp('2019-12-31 00:00:00'))\n",
      "Data columns (total 5 columns):\n",
      " #   Column  Non-Null Count     Dtype  \n",
      "---  ------  --------------     -----  \n",
      " 0   close   18229815 non-null  float64\n",
      " 1   high    18229815 non-null  float64\n",
      " 2   low     18229815 non-null  float64\n",
      " 3   open    18229815 non-null  float64\n",
      " 4   volume  18229815 non-null  float64\n",
      "dtypes: float64(5)\n",
      "memory usage: 765.9+ MB\n"
     ]
    }
   ],
   "source": [
    "# `show_counts` replaces `null_counts`, which pandas 2.0 removed.\n",
    "prices.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:29.108821Z",
     "start_time": "2020-06-22T14:15:29.023650Z"
    }
   },
   "outputs": [],
   "source": [
    "metadata = pd.read_hdf(results_path / 'data.h5', 'stocks/info').rename(columns=str.lower)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:29.115258Z",
     "start_time": "2020-06-22T14:15:29.110250Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 4288 entries, A to ZYNE\n",
      "Columns: 105 entries, zip to underlyingsymbol\n",
      "dtypes: bool(2), float64(68), int64(7), object(28)\n",
      "memory usage: 3.4+ MB\n"
     ]
    }
   ],
   "source": [
    "metadata.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select tickers with metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:29.126507Z",
     "start_time": "2020-06-22T14:15:29.116392Z"
    }
   },
   "outputs": [],
   "source": [
    "# Keep only sectors with more than 50 tickers. Taking `.index` of the boolean\n",
    "# comparison would return every sector, so filter the counts first.\n",
    "sector_counts = metadata.sector.value_counts()\n",
    "sectors = sector_counts[sector_counts > 50].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:29.135209Z",
     "start_time": "2020-06-22T14:15:29.127429Z"
    }
   },
   "outputs": [],
   "source": [
    "tickers_with_errors = ['FTAI', 'AIRT', 'CYBR', 'GRAF', 'KTB']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:29.148401Z",
     "start_time": "2020-06-22T14:15:29.136071Z"
    }
   },
   "outputs": [],
   "source": [
    "# Tickers in a retained sector that have a market cap and a positive share count.\n",
    "keep_ticker = (metadata.sector.isin(sectors) &\n",
    "               metadata.marketcap.notnull() &\n",
    "               metadata.sharesoutstanding.notnull() &\n",
    "               (metadata.sharesoutstanding > 0))\n",
    "# errors='ignore': a known-bad ticker may already have been removed by the mask,\n",
    "# in which case a plain drop() would raise KeyError.\n",
    "tickers_with_metadata = (metadata.loc[keep_ticker].index\n",
    "                         .drop(tickers_with_errors, errors='ignore'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:15:29.157084Z",
     "start_time": "2020-06-22T14:15:29.149620Z"
    }
   },
   "outputs": [],
   "source": [
    "# Keep the three metadata fields used downstream and name the index level 'ticker'.\n",
    "metadata = (metadata\n",
    "            .loc[tickers_with_metadata, ['sector', 'sharesoutstanding', 'marketcap']]\n",
    "            .rename_axis('ticker'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:20:57.486909Z",
     "start_time": "2020-06-22T14:15:29.158103Z"
    }
   },
   "outputs": [],
   "source": [
    "# A boolean mask on the ticker level avoids the slow list-based MultiIndex lookup\n",
    "# and tolerates tickers that have metadata but no price history.\n",
    "in_universe = prices.index.get_level_values('ticker').isin(tickers_with_metadata)\n",
    "prices = prices.loc[in_universe]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:20:57.763560Z",
     "start_time": "2020-06-22T14:20:57.488134Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 13563058 entries, ('A', Timestamp('1999-11-18 00:00:00')) to ('ZYME', Timestamp('2019-12-31 00:00:00'))\n",
      "Data columns (total 5 columns):\n",
      " #   Column  Non-Null Count     Dtype  \n",
      "---  ------  --------------     -----  \n",
      " 0   close   13563058 non-null  float64\n",
      " 1   high    13563058 non-null  float64\n",
      " 2   low     13563058 non-null  float64\n",
      " 3   open    13563058 non-null  float64\n",
      " 4   volume  13563058 non-null  float64\n",
      "dtypes: float64(5)\n",
      "memory usage: 569.3+ MB\n"
     ]
    }
   ],
   "source": [
    "# `show_counts` replaces `null_counts`, which pandas 2.0 removed.\n",
    "prices.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:20:57.770493Z",
     "start_time": "2020-06-22T14:20:57.765974Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 4063 entries, A to ZYNE\n",
      "Data columns (total 3 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   sector             4063 non-null   object \n",
      " 1   sharesoutstanding  4063 non-null   float64\n",
      " 2   marketcap          4063 non-null   float64\n",
      "dtypes: float64(2), object(1)\n",
      "memory usage: 127.0+ KB\n"
     ]
    }
   ],
   "source": [
    "metadata.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:02.322478Z",
     "start_time": "2020-06-22T14:20:57.771412Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 7559 entries, 1990-01-02 to 2019-12-31\n",
      "Columns: 3742 entries, A to ZYME\n",
      "dtypes: float64(3742)\n",
      "memory usage: 215.9 MB\n"
     ]
    }
   ],
   "source": [
    "# Wide frame of closing prices: one row per date, one column per ticker.\n",
    "close = prices['close'].unstack(level='ticker').sort_index()\n",
    "close.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:06.890724Z",
     "start_time": "2020-06-22T14:21:02.323471Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 7559 entries, 1990-01-02 to 2019-12-31\n",
      "Columns: 3742 entries, A to ZYME\n",
      "dtypes: float64(3742)\n",
      "memory usage: 215.9 MB\n"
     ]
    }
   ],
   "source": [
    "# Wide frame of traded volume: one row per date, one column per ticker.\n",
    "volume = prices['volume'].unstack(level='ticker').sort_index()\n",
    "volume.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create weekly returns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:11.776060Z",
     "start_time": "2020-06-22T14:21:06.891688Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 1565 entries, 1990-01-12 to 2020-01-03\n",
      "Freq: W-FRI\n",
      "Columns: 3742 entries, A to ZYME\n",
      "dtypes: float64(3742)\n",
      "memory usage: 44.7 MB\n"
     ]
    }
   ],
   "source": [
    "# Weekly simple returns from the last close of each week ending Friday.\n",
    "# Reuses the wide `close` frame instead of unstacking `prices` a second time;\n",
    "# resample already returns a sorted index, so no extra sort is needed.\n",
    "# The first week has no prior close and is dropped.\n",
    "returns = (close\n",
    "           .resample('W-FRI')\n",
    "           .last()\n",
    "           .pct_change()\n",
    "           .iloc[1:])\n",
    "returns.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:11.779581Z",
     "start_time": "2020-06-22T14:21:11.777434Z"
    }
   },
   "outputs": [],
   "source": [
    "dates = returns.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:12.030307Z",
     "start_time": "2020-06-22T14:21:11.780857Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAD3CAYAAADi8sSvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAYMUlEQVR4nO3df0xV9/3H8deVH7cIWEXWwDeIv6aRQph2zpm50i2ZozE6O6dsUiERRopxc9rVX1TUBmrb1NpkRF1r3T/YhRL3y2zdr5p2Jit1i06ZeFkzhyTopRNr6gXkcoHz/WPxTityD5cD3PvZ8/HXuZfD+77ffLgvTg73nuuyLMsSAMAIE8a7AQCAcwh1ADAIoQ4ABiHUAcAghDoAGCR2PB/83LlzcrvdkiS/3x/cjlbRPkO09y8xQ6RghtHl9/s1f/78Qb82rqHudruVlZUlSfJ4PMHtaBXtM0R7/xIzRApmGF0ej+e+X+P0CwAYhFAHAIMQ6gBgEEIdAAxCqAOAQQh1ADAIoQ4ABiHUAcAgtt58dP78ee3fv1+1tbXasmWLOjo6JElXrlzR5z73Ob366quqrq7W2bNnlZiYKEk6dOiQkpOTR69zAMA9Qob6kSNHdOLECSUkJEiSXn31VUnSJ598ouLiYu3cuVOS1NTUpDfeeEMpKSmj2O5/fdLdK5+/z5Faye5YPTgx3pFaADCeQoZ6ZmamampqtG3btrvur6mp0bp16/TQQw9pYGBAra2t2r17tzo6OrR69WqtXr161JqWJJ+/T6c+7HCkVt7cVEIdgBFChnp+fr7a2truuu/69etqaGgIHqV3d3dr3bp1Wr9+vfr7+1VcXKycnBzNmzdvyNp+vz94DYOenp4hr2fwab2xSfK2e23vP5TrKS752ltHXGe4M0SaaO9fYoZIwQzjJ6wLev3ud7/T8uXLFRMTI0lKSEhQcXFx8BTN4sWL1dzcHDLUR3JBr7Yb3UpPc+bjVaemTlXGlGkjrhPJFwCyI9r7l5ghUjDD6HL8gl4NDQ3Ky8sL3r58+bIKCwvV39+vQCCgs2fPKjs7O5zSAIARCOtIvaWlRdOm/ffIdvbs2VqxYoUKCgoUFxenlStXas6cOY41CQCwx1aoZ2RkqL6+Pnj7N7/5zT37lJWVqayszLnOAADDxpuPAMAghDoAGIRQBwCDEOoAYBBCHQAMQqgDgEEIdQAwCKEOAAYh1AHAIIQ6ABiEUAcAgxDqAGAQQh0ADEKoA4BBCHUAMAihDgAGIdQBwCCEOgAYhFAHAIMQ6gBgEEIdAAxCqAOAQWyF+vnz51VUVCRJampq0qOPPqqioiIVFRXp7bffliTV19dr1apVKigo0Lvvvjt6HQMA7is21A5HjhzRiRMnlJCQIEm6ePGi1q9fr5KSkuA+165dU21trX72s5/J7/ersLBQS5YsUXx8/Oh1DnzKJ9296o1NUtuN7hHXSnbH6sGJ/P4i+oQM9czMTNXU1Gjbtm2SpAsXLqilpUUnT57U9OnTVVFRocbGRi1YsEDx8fGKj49XZmammpublZubO2Rtv98vj8cjSerp6Qlu29EbmyRvu9f2/kO5nuKSr711xHWGO0Okifb+e2OT9LvzrYqNuzriWl97+P8U39fpQFfDF+3rIDHDeAoZ6vn5+Wprawvezs3N1Zo1a5STk6PDhw/r4MGDmjdvnpKTk4P7JCYmqrMz9BPC7XYrKytLkuTxeILbdrTd6FZ6mmV7/6FMTZ2qjCnTRlxnuDOMpU+6e+Xz9w25T2/HdSWnTg1ZK1KPYttudCs27qrS09JHXMup34lwRPLvkV3MMLqG+mMTMtQ/benSpZo0aVJwu6qqSgsXLlRXV1dwn66urrtCHuPP5+/TqQ87htzH2+619Ycyb25qRIY6gDBe/VJaWqrGxkZJUkNDg7Kzs5Wbm6szZ87I7/fL5/Pp0qVLmjt3ruPNAgCGNuwj
9b1796qqqkpxcXFKTU1VVVWVkpKSVFRUpMLCQlmWpS1btsjtdo9GvwCAIdgK9YyMDNXX10uSsrOzVVdXd88+BQUFKigocLY7AMCw8OYjADAIoQ4ABiHUAcAghDoAGIRQBwCDEOoAYBBCHQAMQqgDgEGG/Y5SjB07F+Gyyx/od6QOgMhGqEcwOxfhsmtB5mRH6gCIbJx+AQCDEOoAYBBCHQAMwjl1h/HPTQDjiVB3GP/cBDCeOP0CAAYh1AHAIIQ6ABiEc+qS+voH1Haje8R1emOT+OcmgHFFqEu6FRjQ3y59POI63navli160IGOACA8nH4BAIPYOlI/f/689u/fr9raWnk8HlVVVSkmJkbx8fF66aWXlJqaqurqap09e1aJiYmSpEOHDik5OXlUmwcA3C1kqB85ckQnTpxQQkKCJOn5559XZWWlsrKyVFdXpyNHjmjnzp1qamrSG2+8oZSUlFFvGgAwuJCnXzIzM1VTUxO8feDAAWVlZUmS+vv75Xa7NTAwoNbWVu3evVvf+c53dPz48dHrGABwXyGP1PPz89XW1ha8/dBDD0mSzp49q2PHjunNN99Ud3e31q1bp/Xr16u/v1/FxcXKycnRvHnzhqzt9/vl8XgkST09PcFtO3pjk+Rt99refyjzUuMdqdUXCOhWd3fE9WW3Vl8gYOvxrqe45GtvdaQvJ/XGJtmeIZTxnHG4z4VIxAzjJ6xXv7z99ts6fPiwXn/9daWkpASD/PYpmsWLF6u5uTlkqLvd7uBRv8fjCW7b0XajW+lpVjjt3yNh4kSlp6WPuI633etYLcm5vuzW8rZ7bT3e1NSpypgyzZG+nNR2o1uxcVcd+ZmN54zDfS5EImYYXUP9sRn2q19+9atf6dixY6qtrdW0af/5pb98+bIKCwvV39+vQCCgs2fPKjs7O/yOAQBhGdaRen9/v55//nmlp6fr+9//viTpC1/4gjZt2qQVK1aooKBAcXFxWrlypebMmTMqDQMA7s9WqGdkZKi+vl6S9Je//GXQfcrKylRWVuZcZwCAYePNRwBgEEIdAAxCqAOAQQh1ADAIoQ4ABiHUAcAghDoAGIRQBwCDEOoAYBBCHQAMwmeUAoNw6sPIJSnZHasHJ8Y7UgsIhVDHsP0vBJ5TH0YuSXlzUyNyRpiJUMewORl4X5qdIp+/z5Fa/kC/I3WAaEaoY1w5+QdiQeZkR+oA0Yx/lAKAQQh1ADAIoQ4ABiHUAcAghDoAGIRQBwCDEOoAYBBboX7+/HkVFRVJklpbW7V27VoVFhZqz549GhgYkCTV19dr1apVKigo0Lvvvjt6HQMA7itkqB85ckS7du2S3++XJL3wwgvavHmzfvrTn8qyLJ08eVLXrl1TbW2t6urqdPToUR04cEC9vb2j3jwA4G4hQz0zM1M1NTXB201NTVq0aJEkKS8vT++//74aGxu1YMECxcfHKzk5WZmZmWpubh69rgEAgwp5mYD8/Hy1tbUFb1uWJZfLJUlKTEyUz+dTZ2enkpOTg/skJiaqs7Mz5IP7/X55PB5JUk9PT3Dbjt7YJHnbvbb3H8q81HhHavUFArrV3R1xfdmt1RcI2Hq8se5rOLXszmCnllN9XU9xydfeanv/4T4XIhEzjJ9hX/tlwoT/Htx3dXVp0qRJSkpKUldX11333xny9+N2u5WVlSVJ8ng8wW072m50Kz3NGkbn95cwcaLS09JHXMfb7nWsluRcX3Zredu9th4vkmeMjYtzpJ6TfU1NnaqMKdNs7z/c50IkYobRNdQfm2G/+uXhhx/W6dOnJUmnTp3SwoULlZubqzNnzsjv98vn8+nSpUuaO3du+B0DAMIy7CP17du3q7KyUgcOHNCsWbOUn5+vmJgYFRUVqbCwUJZlacuWLXK73aPRLwBgCLZCPSMjQ/X19ZKkmTNn6tixY/fsU1BQoIKCAme7AwAMC28+AgCDEOoAYBBCHQAMQqgDgEEIdQAwCKEOAAYh1AHAIIQ6ABiEUAcA
gxDqAGAQQh0ADEKoA4BBCHUAMAihDgAGIdQBwCCEOgAYhFAHAIMQ6gBgEEIdAAxCqAOAQQh1ADBIbDjf9POf/1y/+MUvJEl+v18ej0d1dXUqLy/XjBkzJElr167VsmXLHGsUABBaWKG+atUqrVq1SpL03HPP6Vvf+pYuXryo9evXq6SkxNEGAQD2jej0y9///nf985//1Le//W1duHBB7733np588klVVFSos7PTqR4BADaFdaR+22uvvaaNGzdKknJzc7VmzRrl5OTo8OHDOnjwoLZv3z7k998+dSNJPT09wW07emOT5G33ht/8HealxjtSqy8Q0K3u7ojry26tvkDA1uONdV/DqWV3Bju1nOrreopLvvZW2/sP97kQiZhh/IQd6jdv3tS//vUvLV68WJK0dOlSTZo0KbhdVVUVsobb7VZWVpYkyePxBLftaLvRrfQ0K4zO75UwcaLS09JHXMfb7nWsluRcX3Zredu9th4vkmeMjYtzpJ6TfU1NnaqMKdNs7z/c50IkYobRNdQfm7BPv/z1r3/Vl770peDt0tJSNTY2SpIaGhqUnZ0dbmkAQJjCPlJvaWlRRkZG8PbevXtVVVWluLg4paam2jpSBwA4K+xQ/+53v3vX7ezsbNXV1Y24IQBA+HjzEQAYZESvfgEQ3T7p7pXP3+dIrWR3rB6cGO9ILYSPUAf+h/n8fTr1YYcjtfLmphLqEYDTLwBgEEIdAAxCqAOAQQh1ADAIoQ4ABiHUAcAghDoAGIRQBwCDEOoAYBBCHQAMQqgDgEG49gsQZZy8CJc/0O9IHUQOQh2IMk5ehGtB5mRH6iBycPoFAAzCkToAR/T1D6jtRrckqTc2KbgdDq7NHj5CHRhld4adHaECMVLPg98KDOhvlz6WJHnbvUpPs8KuxbXZw0eoA6PszrCzI1Qgch4cQ+GcOgAYhFAHAIOEffrliSeeUHJysiQpIyND5eXl2rFjh1wul+bMmaM9e/ZowgT+ZgDAWAor1P1+vySptrY2eF95ebk2b96sL37xi9q9e7dOnjyppUuXOtMlAMCWsEK9ublZt27dUklJifr6+vT000+rqalJixYtkiTl5eXpz3/+c8hQ9/v98ng8kqSenp7gth29sUnytnvDaf8e81LjHanVFwjoVnd3xPVlt1ZfIGDr8ca6r+HUsjuDnVrjNWOoGSL553+71kjX4XqKS772Vkf6CtdwMylShBXqDzzwgEpLS7VmzRpdvnxZZWVlsixLLpdLkpSYmCifzxeyjtvtVlZWliTJ4/EEt+1ou9E9opdM3Slh4kSlp6WPuI633etYLcm5vuzW+s+rLkI/XiTPGBsX50i98Zwx1DpE8s//di27v0v3MzV1qjKmTHOkr3ANN5PG0lB/bMIK9ZkzZ2r69OlyuVyaOXOmJk+erKampuDXu7q6NGnSpHBKAwBGIKz/ZB4/flwvvviiJOmjjz5SZ2enlixZotOnT0uSTp06pYULFzrXJQDAlrCO1FevXq2dO3dq7dq1crlc2rdvn6ZMmaLKykodOHBAs2bNUn5+vtO9AgBCCCvU4+Pj9corr9xz/7Fjx0bcEAAgfLyQHAAMQqgDgEEIdQAwCKEOAAYh1AHAIIQ6ABiEUAcAg/DJRwCM9kl3r3z+vmF/32AfKxgNn51KqAMwms/fp1Mfdgz7+wb7WMFo+OxUTr8AgEE4UgcQcfr6B+459REuf6DfkTqSs32N1qkcQh1AxLkVGNDfLn3sSK0FmZMdqSM529doncrh9AsAGIRQBwCDEOoAYBBCHQAMQqgDgEEIdQAwCKEOAAYh1AHAIIQ6ABgkrHeUBgIBVVRU6MqVK+rt7dWGDRuUlpam8vJyzZgxQ5K0du1aLVu2zMleAQAhhBXqJ06c0OTJk/Xyyy/rxo0b+uY3v6mNGzdq/fr1KikpcbpHAIBNYYX6448/rvz8/ODtmJgYXbhwQS0tLTp58qSmT5+uiooKJSUlOdYoACC0sEI9MTFRktTZ
2alNmzZp8+bN6u3t1Zo1a5STk6PDhw/r4MGD2r59+5B1/H6/PB6PJKmnpye4bUdvbJK87d5w2r/HvNR4R2r1BQK61d0dcX3ZrdUXCNh6vLHuazi17M5gp9Z4zRhqhkj++d+uNdJ1iIQZB5vByb6up7jka291pNadwr5Ko9fr1caNG1VYWKgVK1bo5s2bmjRpkiRp6dKlqqqqClnD7XYrKytLkuTxeILbdrTd6L7nAvbhSpg4Uelp6SOu4233OlZLcq4vu7X+86EAoR8vkmeMjYtzpN54zhhqHSL553+7lt3fpbHuazgGm8HJvqamTlXGlGlhfe9QB8Bhvfqlo6NDJSUl2rp1q1avXi1JKi0tVWNjoySpoaFB2dnZ4ZQGAIxAWEfqP/7xj3Xz5k0dOnRIhw4dkiTt2LFD+/btU1xcnFJTU20dqQMAnBVWqO/atUu7du265/66uroRNwQACB9vPgIAgxDqAGAQQh0ADEKoA4BBCHUAMAihDgAGIdQBwCCEOgAYhFAHAIMQ6gBgEEIdAAxCqAOAQQh1ADAIoQ4ABiHUAcAghDoAGIRQBwCDEOoAYBBCHQAMQqgDgEEIdQAwSKyTxQYGBrR371794x//UHx8vKqrqzV9+nQnHwIAMARHj9Tfeecd9fb26q233tIPf/hDvfjii06WBwCE4GionzlzRo8++qgkaf78+bpw4YKT5QEAIbgsy7KcKvbss8/q61//uh577DFJ0le+8hW98847io0d/CzPuXPn5Ha7nXp4APif4Pf7NX/+/EG/5ug59aSkJHV1dQVvDwwM3DfQJd23KQBAeBw9/fLII4/o1KlTkv5zFD537lwnywMAQnD09MvtV798+OGHsixL+/bt0+zZs50qDwAIwdFQBwCML958BAAGIdQBwCCEOgAYxNGXNIbyxBNPKDk5WZKUkZGh8vJy7dixQy6XS3PmzNGePXs0YcIE1dfXq66uTrGxsdqwYYO++tWvjmWb9zh//rz279+v2tpatba22u65p6dHW7du1fXr15WYmKiXXnpJKSkp4z5DU1OTysvLNWPGDEnS2rVrtWzZsoidIRAIqKKiQleuXFFvb682bNigz372s1G1DoPNkJaWFlXr0N/fr127dqmlpUUxMTF64YUXZFlWVK3DYDP4fL6oWoeQrDHS09NjrVy58q77nnrqKeuDDz6wLMuyKisrrT/84Q/Wv//9b2v58uWW3++3bt68GdweL6+//rq1fPlya82aNcPu+Sc/+Yn1ox/9yLIsy/r1r39tVVVVRcQM9fX11tGjR+/aJ5JnOH78uFVdXW1ZlmV9/PHH1mOPPRZ16zDYDNG2Dn/84x+tHTt2WJZlWR988IFVXl4edesw2AzRtg6hjNnpl+bmZt26dUslJSUqLi7WuXPn1NTUpEWLFkmS8vLy9P7776uxsVELFixQfHy8kpOTlZmZqebm5rFq8x6ZmZmqqakJ3h5Oz3deNiEvL08NDQ0RMcOFCxf03nvv6cknn1RFRYU6OzsjeobHH39cP/jBD4K3Y2Jiom4dBpsh2tbha1/7mqqqqiRJV69eVWpqatStw2AzRNs6hDJmof7AAw+otLRUR48e1XPPPadnnnlGlmXJ5XJJkhITE+Xz+dTZ2Rk8RXP7/s7OzrFq8x75+fl3vSt2OD3fef/tfcfDp2fIzc3Vtm3b9Oabb2ratGk6ePBgRM+QmJiopKQkdXZ2atOmTdq8eXPUrcNgM0TbOkhSbGystm/frqqqKuXn50fdOgw2QzSuw1DGLNRnzpypb3zjG3K5XJo5c6YmT56s69evB7/e1dWlSZMm3XOpga6urrt+uONtwoT//shC9Xzn/bf3jQRLly5VTk5OcPvixYsRP4PX61VxcbFWrlypFStWROU6fHqGaFwHSXrppZf0+9//XpWVlfL7/cH7o2UdpLtn+PKXvxyV63A/Yxbqx48fD16K96OPPlJnZ6eWLFmi06dPS5JOnTqlhQsXKjc3V2fOnJHf
75fP59OlS5ci6nIDDz/8sO2eH3nkEf3pT38K7vv5z39+PFsPKi0tVWNjoySpoaFB2dnZET1DR0eHSkpKtHXrVq1evVpS9K3DYDNE2zr88pe/1GuvvSZJSkhIkMvlUk5OTlStw2AzfO9734uqdQhlzN5R2tvbq507d+rq1atyuVx65plnNGXKFFVWVioQCGjWrFmqrq5WTEyM6uvr9dZbb8myLD311FPKz88fixbvq62tTU8//bTq6+vV0tJiu+dbt25p+/btunbtmuLi4vTKK6/oM5/5zLjP0NTUpKqqKsXFxSk1NVVVVVVKSkqK2Bmqq6v129/+VrNmzQre9+yzz6q6ujpq1mGwGTZv3qyXX345atahu7tbO3fuVEdHh/r6+lRWVqbZs2dH1fNhsBnS09Oj6vkQCpcJAACD8OYjADAIoQ4ABiHUAcAghDoAGIRQBwCDEOoAYBBCHQAM8v9BRHJkf0ZvrwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Distribution of the number of tickers with a return observation per week.\n",
    "# histplot replaces the deprecated distplot(..., kde=False).\n",
    "ax = sns.histplot(returns.count(axis=1))\n",
    "ax.set(xlabel='Tickers with a weekly return', ylabel='Number of weeks');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:12.218377Z",
     "start_time": "2020-06-22T14:21:12.031529Z"
    }
   },
   "outputs": [],
   "source": [
    "# Persist the base data sets, each under its own key, in the autoencoder store.\n",
    "base_data = {'close': close,\n",
    "             'volume': volume,\n",
    "             'returns': returns,\n",
    "             'metadata': metadata}\n",
    "with pd.HDFStore(results_path / 'autoencoder.h5') as store:\n",
    "    for name, frame in base_data.items():\n",
    "        store.put(name, frame)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Factor Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:12.222809Z",
     "start_time": "2020-06-22T14:21:12.219417Z"
    }
   },
   "outputs": [],
   "source": [
    "MONTH = 21"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Price Trend"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Short-Term Reversal"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1-month cumulative return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:12.232878Z",
     "start_time": "2020-06-22T14:21:12.225202Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatetimeIndex(['1990-01-12', '1990-01-19', '1990-01-26', '1990-02-02',\n",
       "               '1990-02-09'],\n",
       "              dtype='datetime64[ns]', name='date', freq='W-FRI')"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dates[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:12.807146Z",
     "start_time": "2020-06-22T14:21:12.235147Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2804201 entries, (Timestamp('1990-02-02 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Dtype  \n",
      "---  ------  -----  \n",
      " 0   mom1m   float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 32.1+ MB\n"
     ]
    }
   ],
   "source": [
    "# Return over the past MONTH rows of `close`, sampled at the last observation of each week.\n",
    "mom1m = (close\n",
    "         .pct_change(periods=MONTH)\n",
    "         .resample('W-FRI')\n",
    "         .last()\n",
    "         .stack()\n",
    "         .to_frame(name='mom1m'))\n",
    "mom1m.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:12.827450Z",
     "start_time": "2020-06-22T14:21:12.808341Z"
    }
   },
   "outputs": [],
   "source": [
    "# squeeze() turns the single-column frame into a Series before saving.\n",
    "# `key` is passed by keyword; positional use is deprecated in recent pandas.\n",
    "mom1m.squeeze().to_hdf(results_path / 'autoencoder.h5', key='factor/mom1m')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Stock Momentum"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "11-month cumulative returns ending 1-month before month end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:13.379746Z",
     "start_time": "2020-06-22T14:21:12.828908Z"
    }
   },
   "outputs": [],
   "source": [
    "# 11-month return, lagged by one month so the most recent month is skipped.\n",
    "mom12m = (close.pct_change(periods=11 * MONTH)\n",
    "          .shift(periods=MONTH)\n",
    "          .resample('W-FRI').last()\n",
    "          .stack()\n",
    "          .to_frame(name='mom12m'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:13.422771Z",
     "start_time": "2020-06-22T14:21:13.380673Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2631702 entries, (Timestamp('1991-01-04 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   mom12m  2631702 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 30.2+ MB\n"
     ]
    }
   ],
   "source": [
    "# `show_counts` replaces `null_counts`, which pandas 2.0 removed.\n",
    "mom12m.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:13.476247Z",
     "start_time": "2020-06-22T14:21:13.423954Z"
    }
   },
   "outputs": [],
   "source": [
    "# `key` is passed by keyword; positional use is deprecated in recent pandas.\n",
    "mom12m.to_hdf(results_path / 'autoencoder.h5', key='factor/mom12m')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Momentum Change"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cumulative return from months t-6 to t-1 minus months t-12 to t-7."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:14.285458Z",
     "start_time": "2020-06-22T14:21:13.477233Z"
    }
   },
   "outputs": [],
   "source": [
    "# Change in 6-month momentum: the latest 6-month return minus the 6-month return\n",
    "# observed six months earlier. pipe() lets the 6-month return be computed once\n",
    "# instead of twice on the full price frame.\n",
    "chmom = (close\n",
    "         .pct_change(periods=6 * MONTH)\n",
    "         .pipe(lambda ret_6m: ret_6m.sub(ret_6m.shift(6 * MONTH)))\n",
    "         .resample('W-FRI')\n",
    "         .last()\n",
    "         .stack()\n",
    "         .to_frame('chmom'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:14.329782Z",
     "start_time": "2020-06-22T14:21:14.286622Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2631702 entries, (Timestamp('1991-01-04 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   chmom   2631702 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 30.2+ MB\n"
     ]
    }
   ],
   "source": [
    "# `show_counts` replaces `null_counts`, which pandas 2.0 removed.\n",
    "chmom.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:14.350382Z",
     "start_time": "2020-06-22T14:21:14.330740Z"
    }
   },
   "outputs": [],
   "source": [
    "# `key` is passed by keyword; positional use is deprecated in recent pandas.\n",
    "chmom.to_hdf(results_path / 'autoencoder.h5', key='factor/chmom')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Industry Momentum"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Equal-weighted avg. industry 12-month returns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:15.184318Z",
     "start_time": "2020-06-22T14:21:14.351381Z"
    }
   },
   "outputs": [],
   "source": [
    "# Equal-weighted mean of the tickers' 12-month returns per week and sector,\n",
    "# returned in long format with columns date, sector and indmom.\n",
    "indmom = (close\n",
    "          .pct_change(periods=12 * MONTH)\n",
    "          .resample('W-FRI').last()\n",
    "          .stack()\n",
    "          .to_frame(name='close')\n",
    "          .join(metadata[['sector']])\n",
    "          .groupby(['date', 'sector'])['close']\n",
    "          .mean()\n",
    "          .to_frame(name='indmom')\n",
    "          .reset_index())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:15.191212Z",
     "start_time": "2020-06-22T14:21:15.185168Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 18204 entries, 0 to 18203\n",
      "Data columns (total 3 columns):\n",
      " #   Column  Non-Null Count  Dtype         \n",
      "---  ------  --------------  -----         \n",
      " 0   date    18204 non-null  datetime64[ns]\n",
      " 1   sector  18204 non-null  object        \n",
      " 2   indmom  18204 non-null  float64       \n",
      "dtypes: datetime64[ns](1), float64(1), object(1)\n",
      "memory usage: 426.8+ KB\n"
     ]
    }
   ],
   "source": [
    "# `show_counts` replaces `null_counts`, which pandas 2.0 removed.\n",
    "indmom.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:16.040903Z",
     "start_time": "2020-06-22T14:21:15.192557Z"
    }
   },
   "outputs": [],
   "source": [
    "# Attach each ticker's sector momentum: tag every (date, ticker) return with its\n",
    "# sector, then merge the sector-level values on the shared date and sector columns.\n",
    "indmom = (returns\n",
    "          .stack()\n",
    "          .to_frame(name='ret')\n",
    "          .join(metadata[['sector']])\n",
    "          .reset_index()\n",
    "          .merge(indmom, on=['date', 'sector'])\n",
    "          .set_index(['date', 'ticker'])\n",
    "          .loc[:, ['indmom']])"
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:16.049584Z",
     "start_time": "2020-06-22T14:21:16.041835Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2783776 entries, (Timestamp('1991-01-04 00:00:00'), 'AA') to (Timestamp('2020-01-03 00:00:00'), 'IIM')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   indmom  2783776 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 31.9+ MB\n"
     ]
    }
   ],
   "source": [
    "# `show_counts` replaces `null_counts`, which pandas 2.0 removed.\n",
    "indmom.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:16.079787Z",
     "start_time": "2020-06-22T14:21:16.050892Z"
    }
   },
   "outputs": [],
   "source": [
    "# `key` is passed by keyword; positional use is deprecated in recent pandas.\n",
    "indmom.to_hdf(results_path / 'autoencoder.h5', key='factor/indmom')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Recent Max Return"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Max daily returns from calendar month t-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:17.054646Z",
     "start_time": "2020-06-22T14:21:16.080742Z"
    }
   },
   "outputs": [],
   "source": [
    "# Largest single-day return over the trailing MONTH trading days.\n",
    "# pct_change() with its default period of 1 gives daily returns, so the rolling\n",
    "# maximum matches the 'max daily return' definition; using periods=MONTH here would\n",
    "# instead take the maximum of overlapping one-month returns.\n",
    "maxret = (close\n",
    "          .pct_change()\n",
    "          .rolling(MONTH)\n",
    "          .max()\n",
    "          .resample('W-FRI')\n",
    "          .last()\n",
    "          .stack()\n",
    "          .to_frame('maxret'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:17.102330Z",
     "start_time": "2020-06-22T14:21:17.055840Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2788745 entries, (Timestamp('1990-03-02 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   maxret  2788745 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 32.0+ MB\n"
     ]
    }
   ],
   "source": [
    "# `show_counts` replaces `null_counts`, which pandas 2.0 removed.\n",
    "maxret.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:17.124433Z",
     "start_time": "2020-06-22T14:21:17.103463Z"
    }
   },
   "outputs": [],
   "source": [
    "# `key` is passed by keyword; positional use is deprecated in recent pandas.\n",
    "maxret.to_hdf(results_path / 'autoencoder.h5', key='factor/maxret')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Long-Term Reversal"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cumulative returns months t-36 to t-13."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:17.645632Z",
     "start_time": "2020-06-22T14:21:17.126322Z"
    }
   },
   "outputs": [],
   "source": [
    "# 24-month return, lagged by 12 months so the most recent year is excluded.\n",
    "mom36m = (close.pct_change(periods=24 * MONTH)\n",
    "          .shift(periods=12 * MONTH)\n",
    "          .resample('W-FRI').last()\n",
    "          .stack()\n",
    "          .to_frame(name='mom36m'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:17.679992Z",
     "start_time": "2020-06-22T14:21:17.646409Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2295974 entries, (Timestamp('1993-01-01 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZUMZ')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   mom36m  2295974 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 26.3+ MB\n"
     ]
    }
   ],
   "source": [
    "mom36m.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:17.699815Z",
     "start_time": "2020-06-22T14:21:17.681878Z"
    }
   },
   "outputs": [],
   "source": [
    "mom36m.to_hdf(results_path / 'autoencoder.h5', 'factor/mom36m')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Liquidity Metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Turnover"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Average monthly trading volume over the most recent three months, scaled by the number of shares outstanding; we use the most recent number of shares outstanding from Yahoo Finance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:19.195222Z",
     "start_time": "2020-06-22T14:21:17.701013Z"
    }
   },
   "outputs": [],
   "source": [
    "turn = (volume\n",
    "        .rolling(3*MONTH)\n",
    "        .mean()\n",
    "        .resample('W-FRI')\n",
    "        .last()\n",
    "        .div(metadata.sharesoutstanding)\n",
    "        .stack('ticker')\n",
    "        .to_frame('turn'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:19.240604Z",
     "start_time": "2020-06-22T14:21:19.196062Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2746275 entries, (Timestamp('1990-03-30 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   turn    2746275 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 31.5+ MB\n"
     ]
    }
   ],
   "source": [
    "turn.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:19.265393Z",
     "start_time": "2020-06-22T14:21:19.242428Z"
    }
   },
   "outputs": [],
   "source": [
    "turn.to_hdf(results_path / 'autoencoder.h5', 'factor/turn')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Turnover Volatility"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Monthly std dev of daily share turnover"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:25.649100Z",
     "start_time": "2020-06-22T14:21:19.266679Z"
    }
   },
   "outputs": [],
   "source": [
    "turn_std = (prices\n",
    "            .volume\n",
    "            .unstack('ticker')\n",
    "            .div(metadata.sharesoutstanding)\n",
    "            .rolling(MONTH)\n",
    "            .std()\n",
    "            .resample('W-FRI')\n",
    "            .last()\n",
    "            .stack('ticker')\n",
    "            .to_frame('turn_std'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:25.671667Z",
     "start_time": "2020-06-22T14:21:25.649999Z"
    }
   },
   "outputs": [],
   "source": [
    "turn_std.to_hdf(results_path / 'autoencoder.h5', 'factor/turn_std')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Log Market Equity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Natural log of market cap at end of month t-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:27.915418Z",
     "start_time": "2020-06-22T14:21:25.672801Z"
    }
   },
   "outputs": [],
   "source": [
    "last_price = close.ffill()\n",
    "factor = close.div(last_price.iloc[-1])\n",
    "mvel = np.log1p(factor.mul(metadata.marketcap).resample('W-FRI').last()).stack().to_frame('mvel')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:27.961823Z",
     "start_time": "2020-06-22T14:21:27.916244Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2819247 entries, (Timestamp('1990-01-05 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   mvel    2819247 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 32.3+ MB\n"
     ]
    }
   ],
   "source": [
    "mvel.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:27.986448Z",
     "start_time": "2020-06-22T14:21:27.963757Z"
    }
   },
   "outputs": [],
   "source": [
    "mvel.to_hdf(results_path / 'autoencoder.h5', 'factor/mvel')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Dollar Volume"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Natural log of trading volume times price per share from month t-2."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:28.044030Z",
     "start_time": "2020-06-22T14:21:27.987955Z"
    }
   },
   "outputs": [],
   "source": [
    "dv = close.mul(volume)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:28.842238Z",
     "start_time": "2020-06-22T14:21:28.045390Z"
    }
   },
   "outputs": [],
   "source": [
    "dolvol = (np.log1p(dv.rolling(21)\n",
    "                  .mean()\n",
    "                  .shift(21)\n",
    "                  .resample('W-FRI')\n",
    "                  .last())\n",
    "          .stack()\n",
    "          .to_frame('dolvol'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:28.866377Z",
     "start_time": "2020-06-22T14:21:28.843332Z"
    }
   },
   "outputs": [],
   "source": [
    "dolvol.to_hdf(results_path / 'autoencoder.h5', 'factor/dolvol')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Amihud Illiquidity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Average of daily (absolute return / dollar volume)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:29.989756Z",
     "start_time": "2020-06-22T14:21:28.867523Z"
    }
   },
   "outputs": [],
   "source": [
    "ill = (close.pct_change().abs()\n",
    "       .div(dv)\n",
    "       .rolling(21)\n",
    "       .mean()\n",
    "       .resample('W-FRI').last()\n",
    "       .stack()\n",
    "       .to_frame('ill'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:30.033205Z",
     "start_time": "2020-06-22T14:21:29.990692Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2526364 entries, (Timestamp('1990-02-02 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   ill     2526364 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 29.0+ MB\n"
     ]
    }
   ],
   "source": [
    "ill.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:30.053118Z",
     "start_time": "2020-06-22T14:21:30.034220Z"
    }
   },
   "outputs": [],
   "source": [
    "ill.to_hdf(results_path / 'autoencoder.h5', 'factor/ill')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Risk Measures"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Return Volatility"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Standard dev of daily returns from month t-1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:31.057105Z",
     "start_time": "2020-06-22T14:21:30.054097Z"
    }
   },
   "outputs": [],
   "source": [
    "retvol = (close.pct_change()\n",
    "          .rolling(21)\n",
    "          .std()\n",
    "          .resample('W-FRI')\n",
    "          .last()\n",
    "          .stack()\n",
    "          .to_frame('retvol'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:31.102247Z",
     "start_time": "2020-06-22T14:21:31.058136Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2804201 entries, (Timestamp('1990-02-02 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZYME')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   retvol  2804201 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 32.1+ MB\n"
     ]
    }
   ],
   "source": [
    "retvol.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:31.122100Z",
     "start_time": "2020-06-22T14:21:31.103433Z"
    }
   },
   "outputs": [],
   "source": [
    "retvol.to_hdf(results_path / 'autoencoder.h5', 'factor/retvol')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Market Beta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Estimated market beta from weekly returns and equal weighted market returns for 3 years ending month t-1 with at least 52 weeks of returns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:31.360093Z",
     "start_time": "2020-06-22T14:21:31.123130Z"
    }
   },
   "outputs": [],
   "source": [
    "index = close.resample('W-FRI').last().pct_change().mean(1).to_frame('x')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:21:31.364227Z",
     "start_time": "2020-06-22T14:21:31.361050Z"
    }
   },
   "outputs": [],
   "source": [
    "def get_market_beta(y, x=index):\n",
    "    df = x.join(y.to_frame('y')).dropna()\n",
    "    model = RollingOLS(endog=df.y, \n",
    "                       exog=sm.add_constant(df[['x']]),\n",
    "                      window=3*52)\n",
    "\n",
    "    return model.fit(params_only=True).params['x']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:22:26.558186Z",
     "start_time": "2020-06-22T14:21:31.365415Z"
    }
   },
   "outputs": [],
   "source": [
    "beta = (returns.dropna(thresh=3*52, axis=1)\n",
    "        .apply(get_market_beta).stack().to_frame('beta'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:22:26.592183Z",
     "start_time": "2020-06-22T14:22:26.559169Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2297672 entries, (Timestamp('1993-01-01 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZUMZ')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   beta    2297672 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 26.3+ MB\n"
     ]
    }
   ],
   "source": [
    "beta.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:22:26.610834Z",
     "start_time": "2020-06-22T14:22:26.593125Z"
    }
   },
   "outputs": [],
   "source": [
    "beta.to_hdf(results_path / 'autoencoder.h5', 'factor/beta')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Beta Squared"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Market beta squared"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:22:26.621369Z",
     "start_time": "2020-06-22T14:22:26.611785Z"
    }
   },
   "outputs": [],
   "source": [
    "betasq = beta.beta.pow(2).to_frame('betasq')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:22:26.629120Z",
     "start_time": "2020-06-22T14:22:26.622393Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2297672 entries, (Timestamp('1993-01-01 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZUMZ')\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   betasq  2297672 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 26.3+ MB\n"
     ]
    }
   ],
   "source": [
    "betasq.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:22:26.647655Z",
     "start_time": "2020-06-22T14:22:26.630021Z"
    }
   },
   "outputs": [],
   "source": [
    "betasq.to_hdf(results_path / 'autoencoder.h5', 'factor/betasq')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Idiosyncratic return volatility"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Standard deviation of the residuals from a regression of weekly returns on the returns of an equal-weighted market index over the prior three years."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This takes a while!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T14:22:26.658305Z",
     "start_time": "2020-06-22T14:22:26.650268Z"
    }
   },
   "outputs": [],
   "source": [
    "def get_ols_residuals(y, x=index):\n",
    "    df = x.join(y.to_frame('y')).dropna()\n",
    "    model = sm.OLS(endog=df.y, exog=sm.add_constant(df[['x']]))\n",
    "    result = model.fit()\n",
    "    return result.resid.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T17:07:16.084427Z",
     "start_time": "2020-06-22T14:22:26.659182Z"
    }
   },
   "outputs": [],
   "source": [
    "idiovol = (returns.apply(lambda x: x.rolling(3 * 52)\n",
    "                         .apply(get_ols_residuals)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T17:07:16.210096Z",
     "start_time": "2020-06-22T17:07:16.085430Z"
    }
   },
   "outputs": [],
   "source": [
    "idiovol = idiovol.stack().to_frame('idiovol')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T17:07:16.247578Z",
     "start_time": "2020-06-22T17:07:16.211084Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2297672 entries, (Timestamp('1993-01-01 00:00:00', freq='W-FRI'), 'AA') to (Timestamp('2020-01-03 00:00:00', freq='W-FRI'), 'ZUMZ')\n",
      "Data columns (total 1 columns):\n",
      " #   Column   Non-Null Count    Dtype  \n",
      "---  ------   --------------    -----  \n",
      " 0   idiovol  2297672 non-null  float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 26.3+ MB\n"
     ]
    }
   ],
   "source": [
    "idiovol.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T17:07:16.273176Z",
     "start_time": "2020-06-22T17:07:16.248483Z"
    }
   },
   "outputs": [],
   "source": [
    "idiovol.to_hdf(results_path / 'autoencoder.h5', 'factor/idiovol')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:ml4t-dl]",
   "language": "python",
   "name": "conda-env-ml4t-dl-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
